suppressPackageStartupMessages({
library("EnhancedVolcano")
library("limma")
library("biomaRt")
library("annotables")
library("tximport")
library("apeglm")
library("eulerr")
library("DESeq2")
library("HGNChelper")
library("tictoc")
library("DESeq2")
library("kableExtra")
library("beeswarm")
library("missMethyl")
library("gridExtra")
library("png")
library("metafor")
library("ggplot2")
library("purrr")
library("metafor")
library("AnnotationDbi")
library("readxl")
library("ggplot2")
library("tidyverse")
library("magrittr")
library("readr")
library("eulerr")
library("RColorBrewer")
library("e1071")
library("plotly")
library("pheatmap")
library("msigdbr")
library("mitch")
library("rtracklayer")
library("dplyr")
library("org.Mm.eg.db")
library("fgsea")
})
CORES=16
I ran two kallisto alignments for each sample : one against the cDNA-only index and one against the cDNA + ncRNA index and compared the pseudoalignment rates. The runs using the cDNA + ncRNA index consistently showed higher alignment percentages, so I selected this index for all downstream analyses.
base<- "/mnt/vol1/Mouse_model_RNA_Seq/kallisto_results_plus_ncRNA/"
idx<- "/mnt/vol1/Mouse_model_RNA_Seq/index"
samples<- read_tsv(file.path(base,"samples.txt"),col_names = "sample", show_col_types = FALSE)
samples$path <- file.path(base, samples$sample, "abundance.tsv")
align<- read_delim("/mnt/vol1/Mouse_model_RNA_Seq/kallisto_results_plus_ncRNA/alignment_summary.tsv")
old_align<-read_csv("/mnt/vol1/Mouse_model_RNA_Seq/kallisto_results/aligment_no_ncrna.csv", show_col_types = FALSE)
new_df<- align%>% mutate(index = "cDNA + ncRNA", rate = 100 * pseudoaligned/processed) %>%
select(sample, index, rate)
old_df <- old_align %>% mutate(index = "cDNA only", rate = 100 * pseudoaligned/processed) %>%
select(sample, index, rate)
df<- bind_rows(new_df, old_df)
delta<- df %>% pivot_wider(names_from = index, values_from = rate) %>%
mutate(delta = `cDNA + ncRNA` - `cDNA only`) %>%
arrange(delta)
df$sample <- factor(df$sample, levels = delta$sample)
ggplot(df, aes(sample, rate, fill = index)) +
geom_col(position = position_dodge(width = 0.75), width = 0.7) +
labs(title = "Pseudoalignment rate by sample",
x = "Sample", y = "Rate (%)", fill = "Run") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
tx2gene<- read.csv(file.path("/mnt/vol1/Mouse_model_RNA_Seq/tx2gene_r115.csv"), stringsAsFactors = FALSE)
stopifnot(all(c("TXNAME","GENEID") %in% names(tx2gene)))
files<- file.path(base, samples$sample, "abundance.tsv")
names(files) <- samples$sample
stopifnot(all(file.exists(files)))
txi<- tximport(files,
type = "kallisto",
tx2gene = tx2gene[, c("TXNAME","GENEID")],
countsFromAbundance = "lengthScaledTPM",
ignoreTxVersion = TRUE)
txi$counts[1:2, 1:6]
## BB329 BB330 BB331 BB332 BB334 BB335
## ENSMUSG00000000001 3423.529 4452.743 3423.237 3360.252 4088.225 2816.038
## ENSMUSG00000000003 0.000 0.000 0.000 0.000 0.000 0.000
pheno<- read_excel("/mnt/vol1/Mouse_model_RNA_Seq/minipump.xlsx",skip =1 )
colnames(pheno)
## [1] "Sample" "Sex"
## [3] "Intervention" "RNA concentration\r\nng/ul"
## [5] "RIN" "Well Number"
coldata<- pheno %>%
filter(Sample %in% samples$sample) %>%
arrange(match(Sample, samples$sample)) %>%
mutate(
Sex = factor(Sex, levels = c("F","M")), # F as baseline
Treatment = factor(Intervention, levels = c("Saline","Ang-II"))) # saline as baseline
rownames(coldata)<- coldata$Sample
I made a DESeq2 object from the imported counts, added the sample info, and set the design to use sex, treatment, and their interaction.
dds<- DESeqDataSetFromTximport(txi, colData = coldata, design = ~ Sex + Treatment + Sex:Treatment)
I first removed genes with very low total counts across samples (sum < 10), then I estimated size factors in DESeq2 to account for library size differences, and extracted the normalised counts for inspection.
keep<- rowSums(counts(dds)) >= 10
dds<- dds[keep, ]
dds<- estimateSizeFactors(dds)
norm_counts<- counts(dds, normalized = TRUE)
glimpse(norm_counts)
## num [1:41594, 1:17] 3210.9 157.5 1444.1 196.9 10.3 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:41594] "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000031" "ENSMUSG00000000037" ...
## ..$ : chr [1:17] "BB329" "BB330" "BB331" "BB332" ...
I variance-stabilised the counts and used them to check sample structure. First, I plotted a sample–sample correlation heatmap (annotated by sex and treatment) to spot outliers. Then I ran a PCA, colouring by treatment and shaping by sex, to see if groups separate. After that, I set the baselines (F, Saline) and fit the DESeq2 model with the interaction.
vsd<- vst(dds, blind = TRUE)
ann<- as.data.frame(colData(dds))[, c("Sex","Treatment"), drop = FALSE]
#sample–sample correlation heatmap
pheatmap(cor(assay(vsd)),
annotation_col = ann,
annotation_row = ann)
# PCA (colour = Treatment, shape = sex)
plotPCA(vsd, intgroup = c("Treatment","Sex"))
## Fit model (set baselines)
dds$Sex<- relevel(factor(dds$Sex), "F")
dds$Treatment<- relevel(factor(dds$Treatment), "Saline")
design(dds)<- ~ Sex + Treatment + Sex:Treatment
dds<- DESeq(dds)
resultsNames(dds)
## [1] "Intercept" "Sex_M_vs_F"
## [3] "Treatment_Ang.II_vs_Saline" "SexM.TreatmentAng.II"
# baseline (saline): M vs F
coef_sex<- "Sex_M_vs_F"
# Ang-II vs Saline in females (F is baseline sex)
coef_trt<- "Treatment_Ang.II_vs_Saline"
# (Ang-II effect in M) & (Ang-II effect in F)
coef_int<- "SexM.TreatmentAng.II"
DESeq2 results were obtained for the four contrasts, and log2 fold changes were then shrunk (ashr) to reduce noise from low-count genes. The results were combined and plotted as volcano-style panels to compare raw vs shrunken effects, highlighting genes with FDR <= 0.05 and LFC >= 0.5.
### RAW results ###
# 1. Within each Sex
# 1.1 Female: Ang-II vs Saline
res_F_raw<- results(dds, name = coef_trt)
# 1.2 Male: Ang-II vs Saline
res_M_raw<- results(dds, contrast = list(c(coef_trt, coef_int)))
# 2. Effect of Ang-II treatment between sex
res_int_raw<- results(dds, name = coef_int)
# 3. Baseline (saline): M vs F
res_bsl_raw<- results(dds, name = coef_sex)
# Shrinkage results
res_F_shr<- lfcShrink(dds, coef = coef_trt, type = "ashr")
res_M_shr<- lfcShrink(dds, contrast= list(c(coef_trt, coef_int)), type = "ashr")
res_int_shr<- lfcShrink(dds, coef = coef_int, type = "ashr")
res_bsl_shr<- lfcShrink(dds, coef = coef_sex, type = "ashr")
make_df<- function(raw, shr, label){
data.frame(
gene = rownames(raw),
lfc_raw= raw$log2FoldChange,
lfc_shr = shr$log2FoldChange,
FDR_raw = raw$padj,
contrast = label,
stringsAsFactors = FALSE)
}
res_all<- bind_rows(
make_df(res_F_raw, res_F_shr, "Female: Ang-II vs Saline"),
make_df(res_M_raw, res_M_shr, "Male: Ang-II vs Saline"),
make_df(res_int_raw, res_int_shr,"Interaction: (M & F) treatment effect"),
make_df(res_bsl_raw, res_bsl_shr, "Baseline (Saline): M vs F")
)
# Raw vs Shrunken
res_long<- res_all %>%
mutate(
hit_raw= !is.na(FDR_raw) & FDR_raw<= 0.05 & abs(lfc_raw)>= 0.5,
hit_shr= !is.na(FDR_raw) & FDR_raw<= 0.05 & abs(lfc_shr)>= 0.5
) %>%
dplyr::select(gene, contrast, FDR_raw, lfc_raw, lfc_shr, hit_raw, hit_shr) %>%
tidyr::pivot_longer(
cols = c(lfc_raw, lfc_shr, hit_raw, hit_shr),
names_to= c(".value", "lfc_type"),
names_sep= "_"
) %>%
mutate(
lfc_type = dplyr::recode(lfc_type,
raw = "Raw LFC",
shr = "Shrunken LFC"),
lfc_type = factor(lfc_type, levels = c("Raw LFC", "Shrunken LFC")),
contrast = factor(
contrast,
levels = c(
"Female: Ang-II vs Saline",
"Male: Ang-II vs Saline",
"Interaction: (M & F) treatment effect",
"Baseline (Saline): M vs F"
)
)
)
x_max<- quantile(abs(res_long$lfc), 0.995, na.rm = TRUE)
x_lim<- c(-max(1.5, x_max), max(1.5, x_max))
y_max<- quantile(-log10(res_long$FDR_raw), 0.995, na.rm = TRUE)
y_lim<- c(0, max(5, y_max))
ggplot(res_long, aes(lfc, -log10(FDR_raw), colour = hit)) +
geom_point(alpha = 0.6, size = 1.2) +
geom_vline(xintercept = c(-0.5, 0.5), linetype = 2) +
geom_hline(yintercept = -log10(0.05), linetype = 2) +
coord_cartesian(xlim = x_lim, ylim = y_lim) +
facet_grid(contrast ~ lfc_type, switch = "y") +
scale_colour_manual(values = c("FALSE" = "grey60", "TRUE" = "steelblue4"),
labels = c("FALSE" = "FDR>0.05 or LFC<0.5", "TRUE" = "FDR<=0.05 & LFC>=0.5"),name = NULL) +
labs(
title = "Volcano comparison: Raw vs Shrunken (ashr) with LFC filtering",
x = "log2 fold change",
y = expression(-log[10]("FDR from unshrunken"))) +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
strip.placement = "outside",
strip.background = element_blank(),
strip.text.x = element_text(face = "bold"),
strip.text.y.left = element_text(angle = 0, face = "bold"),
panel.grid.minor = element_blank())
## Without LFC filtering
The same raw vs shrunken comparison was repeated but without applying a fold-change cutoff.
# Raw vs Shrunken
res_long_nolfc<- res_all %>%
mutate(
sig_raw = !is.na(FDR_raw) & FDR_raw <= 0.05,
sig_shr = !is.na(FDR_raw) & FDR_raw <= 0.05
) %>%
dplyr::select(gene, contrast, FDR_raw, lfc_raw, lfc_shr, sig_raw, sig_shr) %>%
tidyr::pivot_longer(
cols= c(lfc_raw, lfc_shr, sig_raw, sig_shr),
names_to= c(".value", "lfc_type"),
names_sep= "_") %>%
mutate(
lfc_type = recode(lfc_type,
raw = "Raw LFC (before)",
shr = "Shrunken LFC (after)"),
lfc_type = factor(lfc_type, levels = c("Raw LFC (before)", "Shrunken LFC (after)")),
contrast= factor(contrast, levels = c(
"Female: Ang-II vs Saline",
"Male: Ang-II vs Saline",
"Interaction: (M & F) treatment effect",
"Baseline (Saline): M vs F"
)))
x_max<- quantile(abs(res_long_nolfc$lfc), 0.995, na.rm = TRUE)
x_lim<- c(-max(1.5, x_max), max(1.5, x_max))
y_max<- quantile(-log10(res_long_nolfc$FDR_raw), 0.995, na.rm = TRUE)
y_lim<- c(0, max(5, y_max))
ggplot(res_long_nolfc, aes(lfc, -log10(FDR_raw), colour = sig)) +
geom_point(alpha = 0.6, size = 1.2) +
geom_hline(yintercept = -log10(0.05), linetype = 2) +
coord_cartesian(xlim = x_lim, ylim = y_lim) +
facet_grid(contrast ~ lfc_type, switch = "y") +
scale_colour_manual(
values = c("FALSE" = "grey60", "TRUE" = "steelblue4"),
labels = c("FALSE" = "FDR > 0.05", "TRUE" = "FDR ≤ 0.05"),
name = NULL) +
labs(
title = "Volcano comparison: Raw vs Shrunken (ashr) without LFC filtering",
x = "log2 fold change",
y = expression(-log[10]("FDR (from unshrunken fit)"))) +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
strip.placement = "outside",
strip.background = element_blank(),
strip.text.x = element_text(face = "bold"),
strip.text.y.left = element_text(angle = 0, face = "bold"),
panel.grid.minor = element_blank())
This section checks whether the treatment effect really differs between males and females.
SexM.TreatmentAng.II (the interaction) = [(Ang-II – Saline) in males] – [(Ang-II – Saline) in females]
The interaction term (SexM.TreatmentAng.II) in the DESeq2 model is a difference-in-differences term. If this coefficient is close to 0 for most genes, it means Ang-II changes gene expression by about the same amount in males and females. In that case, each sex can still have treatment-responsive genes, but the extra difference between sexes is small.
To test this formally, a likelihood ratio test (LRT) was run. The LRT is comparing the full model to the reduced model to identify significant genes.
Full model: ~ Sex + Treatment + Sex:Treatment
Reduced model: ~ Sex + Treatment
The p-values are determined solely by the difference in deviance between the ‘full’ and ‘reduced’ model formula (not log2 fold changes). Essentially the LRT test is testing whether the term(s) removed in the ‘reduced’ model explains a significant amount of variation in the data. Quoted: https://hbctraining.github.io/DGE_workshop/lessons/08_DGE_LRT.html
Results:
So in the case, where females are baseline/ reference; * β_int > 0: Ang-II effect is larger in males than females.
Results of the preliminary states that;
Group sizes: F (4 Ang-II, 3 Saline), M (4 Ang-II, 6 Saline) = unbalanced and small
F vs M LFC scatter: The Pearson correlation between male and female treatment LFCs was small (r ~ 0.056). Because the vast majority of genes have LFCs close to zero, this low correlation is likely could be die to the many near-zero values rather than by strong, opposing effects.
Summary of the full model (with interaction) to the reduced model (without interaction) gene-by-gene, shows ~0–4 genes at FDR <0.1 (and 0–1 at FDR < 0.05) out of 41594 genes. Hence it could be stated that the interaction term adds almost no explanatory power overall.
Effect-size difference (M & F) distribution of median −0.023, centred near 0, further proving that Ang-II effect size is similar in males and females overall.
The histogram of LFC shows that the values of the res_F_raw and res_M_raw is centred around zero(median −0.023), indicating that Ang-II effects are broadly similar between sexes, consistent with the likelihood-ratio test showing negligible evidence for an interaction
# No.of saline and treatment samples of each sex
table(pheno$Sex, pheno$Intervention)
##
## Ang-II Saline
## F 4 3
## M 4 6
# correlation of the interaction
summary(res_F_raw$lfcSE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05636 0.17710 0.40386 0.67548 0.90210 4.80267
summary(res_M_raw$lfcSE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04774 0.15084 0.35101 0.58459 0.78546 4.05917
cor.test(res_F_raw$log2FoldChange, res_M_raw$log2FoldChange, use="complete.obs")
##
## Pearson's product-moment correlation
##
## data: res_F_raw$log2FoldChange and res_M_raw$log2FoldChange
## t = 11.464, df = 41592, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04653857 0.06569855
## sample estimates:
## cor
## 0.05612373
lfc_F<- res_F_raw$log2FoldChange
lfc_M<- res_M_raw$log2FoldChange
delta<- lfc_M - lfc_F
summary(delta)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -30.00000 -0.32066 -0.02349 -0.03099 0.33085 21.20516
# Model without interaction term (Hypothesis testing: Likelihood ratio test (LRT))
dds_lrt<- DESeq(dds, test = "LRT", reduced = ~ Sex + Treatment)
lrt<- results(dds_lrt)
summary(lrt)
##
## out of 41594 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 3, 0.0072%
## LFC < 0 (down) : 1, 0.0024%
## outliers [1] : 21, 0.05%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lrt05<- results(dds_lrt, alpha = 0.05)
summary(lrt05)
##
## out of 41594 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 0, 0%
## LFC < 0 (down) : 1, 0.0024%
## outliers [1] : 21, 0.05%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Distribution of M & F interaction LFC
hist(res_int_raw$log2FoldChange, breaks=80)
Heatmaps were made to show the genes with the strongest evidence of
differential expression in each contrast.
For every contrast, the 40 genes with the smallest FDR were taken,
variance-stabilised counts were scaled per gene, and samples were
labelled by Sex and Treatment.
top_heatmap<- function(res_shr, vsd, label, n=40) {
keep<- order(res_shr$padj, na.last = NA)[1:min(n, sum(!is.na(res_shr$padj)))]
genes<- rownames(res_shr)[keep]
mat<- assay(vsd)[genes, , drop=FALSE]
mat<- scale(t(scale(t(mat))))
ann<- as.data.frame(colData(vsd))[, c("Sex","Treatment"), drop=FALSE]
pheatmap(mat, annotation_col = ann, show_rownames = TRUE,
main = paste0("Top ", n, " DE genes: ", label))
}
top_heatmap(res_F_shr, vsd, "F Ang-II vs Saline")
top_heatmap(res_M_shr, vsd, "M Ang-II vs Saline")
top_heatmap(res_int_shr, vsd, "Interaction (M & F)")
top_heatmap(res_bsl_shr, vsd, "Baseline (M vs F, Saline)")
# Female-only view of female contrast genes
female_cols<- colData(vsd)$Sex == "F"
top_heatmap(res_F_shr, vsd[, female_cols], "F Ang-II vs Saline (female samples)")
# Male-only view of male contrast genes
male_cols<- colData(vsd)$Sex == "M"
top_heatmap(res_M_shr, vsd[, male_cols], "M Ang-II vs Saline (male samples)")
Here the same idea is used, but only genes with FDR <= 0.05 are kept first, and then up to 40 of those are plotted. Some contrasts had fewer than 40 genes at FDR <= 0.05, so the heatmap shows all available significant genes for those contrasts.
top_heatmap_sig<- function(res_shr, vsd, label, n=40, alpha=0.05){
sig<- which(!is.na(res_shr$padj) & res_shr$padj <= alpha)
if (length(sig) == 0) {
message("No genes pass FDR <= ", alpha, " for: ", label)
return(invisible(NULL))
}
keep<- head(sig[order(res_shr$padj[sig])], n)
genes<- rownames(res_shr)[keep]
mat<- assay(vsd)[genes, , drop=FALSE]
mat<- scale(t(scale(t(mat))))
ann<- as.data.frame(colData(vsd))[, c("Sex","Treatment"), drop=FALSE]
pheatmap(mat, annotation_col=ann, show_rownames=TRUE,
main=sprintf("Top %d genes (FDR <= %.2g): %s", length(genes), alpha, label))
}
top_heatmap_sig(res_F_shr, vsd, "F Ang-II vs Saline")
top_heatmap_sig(res_M_shr, vsd, "M Ang-II vs Saline")
top_heatmap_sig(res_bsl_shr, vsd, "Baseline (M vs F, Saline)")
sum(res_int_shr$padj <= 0.05, na.rm=TRUE)
## [1] 1
# Female-only view of female contrast genes
female_cols<- colData(vsd)$Sex == "F"
top_heatmap_sig(res_F_shr, vsd[, female_cols], "F Ang-II vs Saline (female samples)")
# Male-only view of male contrast genes
male_cols<- colData(vsd)$Sex == "M"
top_heatmap_sig(res_M_shr, vsd[, male_cols], "M Ang-II vs Saline (male samples)")
Context: interpretation of the four contrasts
MSigDB “Hallmarks” (collection H) = 50 core biological with minimal overlap gene sets that capture broad, well-defined biological processes.
# Map Ensembl to gene symbols
gtf<- file.path(idx, "Mus_musculus.GRCm39.115.gtf")
gtf_df<- as.data.frame(rtracklayer::import(gtf))
# For symbols
ann<- gtf_df %>%
filter(type == "gene") %>%
transmute(
ensgene = sub("\\.\\d+$", "", gene_id),
symbol = gene_name) %>%
distinct()
head(ann)
## ensgene symbol
## 1 ENSMUSG00000104478 Gm38212
## 2 ENSMUSG00000104385 Gm7449
## 3 ENSMUSG00000101231 Gm28283
## 4 ENSMUSG00000101097 Gm6679
## 5 ENSMUSG00000100764 Gm29155
## 6 ENSMUSG00000100831 Gm17847
# map ensembl to entrez
map_ensg_to_entrez<- function(ens_ids){
ens_ids <- sub("\\.\\d+$","", ens_ids)
tibble(ENSEMBL = ens_ids) %>%
mutate(ENTREZID = AnnotationDbi::mapIds(
org.Mm.eg.db, keys = ENSEMBL, keytype = "ENSEMBL",
column = "ENTREZID", multiVals = "first")) %>%
filter(!is.na(ENTREZID)) %>% distinct()
}
# rank vectors on entrez
rankify_entrez<- function(res_raw, ann_map){
df<- as.data.frame(res_raw) %>%
rownames_to_column("ensgene") %>%
mutate(ensgene = sub("\\.\\d+$","", ensgene)) %>%
left_join(ann_map, by = c("ensgene"="ENSEMBL")) %>%
filter(!is.na(stat), !is.na(ENTREZID))
split(df$stat, df$ENTREZID) %>% vapply(\(v) v[which.max(abs(v))], numeric(1))
}
ens_all<- unique(c(rownames(res_F_raw), rownames(res_M_raw),
rownames(res_int_raw), rownames(res_bsl_raw)))
ann_entrez<- map_ensg_to_entrez(ens_all)
ranks_F_e<- rankify_entrez(res_F_raw, ann_entrez)
ranks_M_e<- rankify_entrez(res_M_raw, ann_entrez)
ranks_Int_e<- rankify_entrez(res_int_raw, ann_entrez)
ranks_Bas_e<- rankify_entrez(res_bsl_raw, ann_entrez)
# Hallmark pathways grouped by gs_name
msig_entrez<- msigdbr(species="Mus musculus", category="H") %>%
distinct(gs_name, entrez_gene) %>%
group_by(gs_name) %>%
summarise(genes = list(unique(entrez_gene)), .groups = "drop") %>%
tibble::deframe()
# overlap
ov_Fe<- sapply(msig_entrez, function(g) sum(names(ranks_F_e) %in% g)); summary(ov_Fe)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32.00 95.75 164.50 142.28 195.75 202.00
# GSEA
run_gsea<- function(ranks, label, pathways) {
fgsea::fgseaMultilevel(pathways = pathways, stats = ranks, minSize = 10, maxSize = 2000) %>% arrange(padj) %>% mutate(contrast = label)
}
gsea_F<- run_gsea(ranks_F_e, "F Ang-II vs Saline", msig_entrez)
gsea_M<- run_gsea(ranks_M_e, "M Ang-II vs Saline", msig_entrez)
gsea_Int<- run_gsea(ranks_Int_e, "Interaction (M & F)", msig_entrez)
gsea_Bas<- run_gsea(ranks_Bas_e, "Baseline (M vs F)", msig_entrez)
gsea_all<- bind_rows(gsea_F, gsea_M, gsea_Int, gsea_Bas)
alpha<- 0.05
top_k<- gsea_all %>%
dplyr::filter(!is.na(padj), padj <= alpha) %>%
dplyr::group_by(contrast) %>%
dplyr::arrange(padj, dplyr::desc(abs(NES)), pathway, .by_group = TRUE) %>%
dplyr::slice_head(n = 10) %>%
dplyr::ungroup() %>%
dplyr::mutate(
path = gsub("^HALLMARK_", "", pathway),
path = gsub("_", " ", path),
id = paste(contrast, path, sep = "|")
) %>%
dplyr::group_by(contrast) %>%
dplyr::mutate(id = factor(id, levels = rev(unique(id)))) %>%
dplyr::ungroup()
wrap_lab<- function(x, width = 32) {
vapply(x, function(s) paste(strwrap(s, width = width), collapse = "\n"), character(1))
}
ggplot(top_k, aes(x = id, y = NES, fill = NES > 0)) +
geom_col() +coord_flip() +
facet_wrap(~ contrast, scales = "free_y") +
scale_x_discrete(labels = function(x) wrap_lab(sub("^[^|]*\\|", "", x), width = 32)) +
scale_fill_manual(
values = c("TRUE" = "firebrick", "FALSE" = "steelblue"),
name = "Direction",
labels = c("FALSE" = "Enriched in genes lower in this contrast",
"TRUE" = "Enriched in genes higher in this contrast")) +
labs(
title = sprintf("Top 10 Hallmark pathways per contrast (FDR <=0.05)", alpha),
x = NULL, y = "Normalised Enrichment Score (NES)") +
theme_minimal(base_size = 11) +
theme(legend.position = "bottom")
GO:BP (MSigDB C5, Biological Process) contains a larger set of more specific, ontology-based pathways that describe defined biological activities
# GO: BP pathways (msigdbr), grouped by gs_name
msig_entrez<- msigdbr(species="Mus musculus", category="C5", subcategory = "GO:BP")%>%
distinct(gs_name, entrez_gene) %>%
group_by(gs_name) %>%
summarise(genes = list(unique(entrez_gene)), .groups = "drop") %>%
tibble::deframe()
# overlap
ov_Fe<- sapply(msig_entrez, function(g) sum(names(ranks_F_e) %in% g)); summary(ov_Fe)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 8.0 18.0 73.3 54.0 1825.0
# GSEA
run_gsea<- function(ranks, label, pathways) {
fgsea::fgseaMultilevel(pathways = pathways, stats = ranks, minSize = 10, maxSize = 2000) %>% arrange(padj) %>% mutate(contrast = label)
}
gsea_F<- run_gsea(ranks_F_e, "F Ang-II vs Saline", msig_entrez)
gsea_M<- run_gsea(ranks_M_e, "M Ang-II vs Saline", msig_entrez)
gsea_Int<- run_gsea(ranks_Int_e, "Interaction (M & F)", msig_entrez)
gsea_Bas<- run_gsea(ranks_Bas_e, "Baseline (M vs F)", msig_entrez)
gsea_all<- bind_rows(gsea_F, gsea_M, gsea_Int, gsea_Bas)
top_k<- gsea_all %>%
dplyr::filter(!is.na(padj), padj <= alpha) %>%
dplyr::group_by(contrast) %>%
dplyr::arrange(padj, dplyr::desc(abs(NES)), pathway, .by_group = TRUE) %>%
dplyr::slice_head(n = 10) %>%
dplyr::ungroup() %>%
dplyr::mutate(
path_label = gsub("^GOBP_", "", pathway),
path_label = gsub("_", " ", path_label),
id = paste(contrast, path_label, sep = "|")) %>%
dplyr::group_by(contrast) %>%
dplyr::mutate(id = factor(id, levels = rev(unique(id)))) %>%
dplyr::ungroup()
ggplot(top_k, aes(x = id, y = NES, fill = NES > 0)) +
geom_col() +
coord_flip() +
facet_wrap(~ contrast, scales = "free_y") +
scale_x_discrete(labels = function(x) wrap_lab(sub("^[^|]*\\|", "", x), width = 32)) +
scale_fill_manual(
values = c("TRUE" = "firebrick", "FALSE" = "steelblue"),
name = "Direction",
labels = c("FALSE" = "Enriched among genes lower in this contrast",
"TRUE" = "Enriched among genes higher in this contrast")) +
labs(title = sprintf("Top 10 GO:BP pathways per contrast (FDR <=0.05)", alpha),
subtitle = NULL,
x = NULL, y = "Normalised Enrichment Score (NES)") +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
strip.text = element_text(face = "bold"),
plot.title = element_text(face = "bold"))
deseq_to_prescore <- function(res_obj) {
as.data.frame(res_obj) %>%
rownames_to_column("ensgene") %>%
mutate(ensgene = sub("\\.\\d+$", "", ensgene)) %>%
filter(!is.na(stat)) %>%
dplyr::select(ensgene, score = stat) %>%
group_by(ensgene) %>%
slice_max(order_by = abs(score), n = 1) %>%
ungroup() %>%
column_to_rownames("ensgene")
}
x_F<- deseq_to_prescore(res_F_raw)
x_M <- deseq_to_prescore(res_M_raw)
x_Int<- deseq_to_prescore(res_int_raw)
x_Bas<- deseq_to_prescore(res_bsl_raw)
geneTable<- ann %>%
filter(!is.na(symbol), symbol != "") %>%
dplyr::select(gene = symbol, ensgene)
# Hallmark
msig_hall<- msigdbr(
species= "Mus musculus",
category= "H") %>%
dplyr::select(gs_name, gene_symbol) %>%
distinct() %>%
group_by(gs_name) %>%
summarise(genes = list(gene_symbol), .groups = "drop") %>%
tibble::deframe()
# GO:BP
msig_gobp<- msigdbr(
species= "Mus musculus",
category= "C5",
subcategory= "GO:BP") %>%
dplyr::select(gs_name, gene_symbol) %>%
distinct() %>%
group_by(gs_name) %>%
summarise(genes = list(gene_symbol), .groups = "drop") %>%
tibble::deframe()
# run mitch for one contrast
run_mitch_one<- function(x_mat, genesets, label) {
mobj <- mitch_import(
x= x_mat,
DEtype= "prescored",
geneTable= geneTable)
mres<- mitch_calc(
x = mobj,
genesets= genesets,
minsetsize= 5,
cores= CORES,
priority= "effect")
out<- mres$enrichment_result %>%
mutate(
contrast= label,
set_clean= gsub("_", " ", gsub("^HALLMARK_|^GOBP_", "", set)))
return(out)
}
#4 contrasts (hallmark)
hall_F<- run_mitch_one(x_F, msig_hall, "F Ang-II vs Saline")
hall_M<- run_mitch_one(x_M, msig_hall, "M Ang-II vs Saline")
hall_Int<- run_mitch_one(x_Int, msig_hall, "Interaction (M vs F)")
hall_Bas<- run_mitch_one(x_Bas, msig_hall, "Baseline M vs F")
hall_all<- bind_rows(hall_F, hall_M, hall_Int, hall_Bas)
#sig + top 10 per contrast
hall_top10<- hall_all %>%
filter(!is.na(p.adjustANOVA), p.adjustANOVA < 0.05, setSize >= 5) %>%
arrange(contrast, p.adjustANOVA, desc(abs(s.dist))) %>%
group_by(contrast) %>%
slice_head(n = 10) %>%
ungroup()
hall_top10
## # A tibble: 40 × 7
## set setSize pANOVA s.dist p.adjustANOVA contrast set_clean
## <chr> <int> <dbl> <dbl> <dbl> <chr> <chr>
## 1 HALLMARK_OXIDATIVE_… 185 2.26e-52 0.647 1.13e-50 Baselin… OXIDATIV…
## 2 HALLMARK_ADIPOGENES… 197 2.49e-36 0.519 6.24e-35 Baselin… ADIPOGEN…
## 3 HALLMARK_HEME_METAB… 193 1.75e-23 0.416 2.42e-22 Baselin… HEME MET…
## 4 HALLMARK_MTORC1_SIG… 197 1.93e-23 0.411 2.42e-22 Baselin… MTORC1 S…
## 5 HALLMARK_GLYCOLYSIS 194 1.63e-22 0.405 1.63e-21 Baselin… GLYCOLYS…
## 6 HALLMARK_FATTY_ACID… 151 6.94e-20 0.429 5.79e-19 Baselin… FATTY AC…
## 7 HALLMARK_P53_PATHWAY 196 1.13e-19 0.375 8.08e-19 Baselin… P53 PATH…
## 8 HALLMARK_MYOGENESIS 197 1.69e-18 0.362 1.06e-17 Baselin… MYOGENES…
## 9 HALLMARK_DNA_REPAIR 149 3.10e-17 0.400 1.72e-16 Baselin… DNA REPA…
## 10 HALLMARK_TNFA_SIGNA… 198 8.51e-17 0.342 4.25e-16 Baselin… TNFA SIG…
## # ℹ 30 more rows
#4 contrasts(GO:BP)
gobp_F<- run_mitch_one(x_F, msig_gobp, "F Ang-II vs Saline")
gobp_M<- run_mitch_one(x_M, msig_gobp, "M Ang-II vs Saline")
gobp_Int<- run_mitch_one(x_Int, msig_gobp, "Interaction (M vs F)")
gobp_Bas<- run_mitch_one(x_Bas, msig_gobp, "Baseline M vs F")
gobp_all<- bind_rows(gobp_F, gobp_M, gobp_Int, gobp_Bas)
gobp_top10<- gobp_all %>%
filter(!is.na(p.adjustANOVA), p.adjustANOVA < 0.05, setSize >= 5) %>%
arrange(contrast, p.adjustANOVA, desc(abs(s.dist))) %>%
group_by(contrast) %>%
slice_head(n = 10) %>%
ungroup()
gobp_top10
## # A tibble: 40 × 7
## set setSize pANOVA s.dist p.adjustANOVA contrast set_clean
## <chr> <int> <dbl> <dbl> <dbl> <chr> <chr>
## 1 GOBP_PHOSPHORUS_ME… 1769 1.49e-120 0.327 1.09e-116 Baselin… PHOSPHOR…
## 2 GOBP_SMALL_MOLECUL… 1601 6.18e-104 0.318 2.26e-100 Baselin… SMALL MO…
## 3 GOBP_VESICLE_MEDIA… 1496 1.42e-102 0.326 3.46e- 99 Baselin… VESICLE …
## 4 GOBP_ESTABLISHMENT… 1631 8.05e-101 0.310 1.47e- 97 Baselin… ESTABLIS…
## 5 GOBP_PROTEIN_CONTA… 1710 6.46e- 94 0.292 9.44e- 91 Baselin… PROTEIN …
## 6 GOBP_REGULATION_OF… 1433 4.20e- 93 0.317 5.13e- 90 Baselin… REGULATI…
## 7 GOBP_NITROGEN_COMP… 1783 3.51e- 90 0.281 3.67e- 87 Baselin… NITROGEN…
## 8 GOBP_APOPTOTIC_PRO… 1764 2.72e- 88 0.279 2.49e- 85 Baselin… APOPTOTI…
## 9 GOBP_MACROMOLECULE… 1397 3.11e- 80 0.297 2.53e- 77 Baselin… MACROMOL…
## 10 GOBP_POSITIVE_REGU… 1704 4.37e- 80 0.270 3.20e- 77 Baselin… POSITIVE…
## # ℹ 30 more rows
# Hallmark pathways
hall_top10$contrast<- factor(
hall_top10$contrast,
levels= c(
"F Ang-II vs Saline",
"M Ang-II vs Saline",
"Interaction (M vs F)",
"Baseline M vs F"
)
)
ggplot(hall_top10,
aes(x = reorder(set_clean, s.dist), y = s.dist, fill = s.dist > 0)) +
geom_col() +
coord_flip() +
facet_wrap(~ contrast, scales = "free_y") +
scale_fill_manual(
values= c("TRUE" = "firebrick", "FALSE" = "steelblue"),
name= "Direction",
labels= c("FALSE" = "Enriched among genes lower in this contrast", "TRUE" = "Enriched among genes higher in this contrast")) +
labs(
title = "mitch: Hallmark pathways (top 10 per contrast)",
x = NULL,
y = "mitch s.dist") +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
strip.text = element_text(face = "bold"))
# GO:BP pathways
gobp_top10$contrast<- factor(
gobp_top10$contrast,
levels = c(
"F Ang-II vs Saline",
"M Ang-II vs Saline",
"Interaction (M & F)",
"Baseline M vs F"
)
)
ggplot(gobp_top10,
aes(x = reorder(set_clean, s.dist), y = s.dist, fill = s.dist > 0)) +
geom_col() +
coord_flip() +
facet_wrap(~ contrast, scales = "free_y") +
scale_fill_manual(
values = c("TRUE" = "firebrick", "FALSE" = "steelblue"),
name = "Direction",
labels = c("FALSE" = "Enriched among genes lower in this contrast", "TRUE" = "Enriched among genes higher in this contrast")) +
labs(
title = "mitch: GO:BP pathways (top 10 per contrast)",
x = NULL,
y = "mitch s.dist") +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
strip.text = element_text(face = "bold"))
sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] fgsea_1.32.4
## [2] org.Mm.eg.db_3.20.0
## [3] rtracklayer_1.66.0
## [4] mitch_1.18.4
## [5] msigdbr_25.1.1
## [6] pheatmap_1.0.13
## [7] plotly_4.11.0
## [8] e1071_1.7-16
## [9] RColorBrewer_1.1-3
## [10] magrittr_2.0.3
## [11] lubridate_1.9.4
## [12] forcats_1.0.1
## [13] stringr_1.5.1
## [14] dplyr_1.1.4
## [15] readr_2.1.5
## [16] tidyr_1.3.1
## [17] tibble_3.2.1
## [18] tidyverse_2.0.0
## [19] readxl_1.4.5
## [20] AnnotationDbi_1.68.0
## [21] purrr_1.1.0
## [22] metafor_4.8-0
## [23] numDeriv_2016.8-1.1
## [24] metadat_1.4-0
## [25] Matrix_1.7-4
## [26] png_0.1-8
## [27] gridExtra_2.3
## [28] missMethyl_1.40.3
## [29] IlluminaHumanMethylationEPICv2anno.20a1.hg38_1.0.0
## [30] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
## [31] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1
## [32] minfi_1.52.1
## [33] bumphunter_1.48.0
## [34] locfit_1.5-9.12
## [35] iterators_1.0.14
## [36] foreach_1.5.2
## [37] Biostrings_2.74.1
## [38] XVector_0.46.0
## [39] beeswarm_0.4.0
## [40] kableExtra_1.4.0
## [41] tictoc_1.2.1
## [42] HGNChelper_0.8.15
## [43] DESeq2_1.46.0
## [44] SummarizedExperiment_1.36.0
## [45] Biobase_2.66.0
## [46] MatrixGenerics_1.18.1
## [47] matrixStats_1.5.0
## [48] GenomicRanges_1.58.0
## [49] GenomeInfoDb_1.42.3
## [50] IRanges_2.40.1
## [51] S4Vectors_0.44.0
## [52] BiocGenerics_0.52.0
## [53] eulerr_7.0.4
## [54] apeglm_1.28.0
## [55] tximport_1.34.0
## [56] annotables_0.2.0
## [57] biomaRt_2.62.1
## [58] limma_3.62.2
## [59] EnhancedVolcano_1.24.0
## [60] ggrepel_0.9.6
## [61] ggplot2_4.0.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-9 httr_1.4.7
## [3] tools_4.4.3 doRNG_1.8.6.2
## [5] utf8_1.2.4 R6_2.6.1
## [7] HDF5Array_1.34.0 lazyeval_0.2.2
## [9] rhdf5filters_1.18.1 withr_3.0.2
## [11] prettyunits_1.2.0 GGally_2.4.0
## [13] base64_2.0.2 preprocessCore_1.68.0
## [15] cli_3.6.5 textshaping_1.0.4
## [17] labeling_0.4.3 sass_0.4.10
## [19] SQUAREM_2021.1 mvtnorm_1.3-3
## [21] S7_0.2.0 genefilter_1.88.0
## [23] proxy_0.4-27 askpass_1.2.1
## [25] mixsqp_0.3-54 Rsamtools_2.22.0
## [27] systemfonts_1.3.1 siggenes_1.80.0
## [29] illuminaio_0.48.0 svglite_2.2.2
## [31] rentrez_1.2.4 dichromat_2.0-0.1
## [33] scrime_1.3.5 invgamma_1.2
## [35] bbmle_1.0.25.1 rstudioapi_0.17.1
## [37] RSQLite_2.4.3 generics_0.1.3
## [39] BiocIO_1.16.0 vroom_1.6.5
## [41] gtools_3.9.5 abind_1.4-8
## [43] lifecycle_1.0.4 yaml_2.3.10
## [45] mathjaxr_1.8-0 gplots_3.2.0
## [47] rhdf5_2.50.2 SparseArray_1.6.2
## [49] BiocFileCache_2.14.0 grid_4.4.3
## [51] blob_1.2.4 promises_1.4.0
## [53] crayon_1.5.3 bdsmatrix_1.3-7
## [55] lattice_0.22-5 cowplot_1.2.0
## [57] echarts4r_0.4.6 GenomicFeatures_1.58.0
## [59] annotate_1.84.0 KEGGREST_1.46.0
## [61] pillar_1.11.1 knitr_1.50
## [63] beanplot_1.3.1 rjson_0.2.23
## [65] codetools_0.2-19 fastmatch_1.1-6
## [67] glue_1.8.0 data.table_1.17.8
## [69] vctrs_0.6.5 cellranger_1.1.0
## [71] gtable_0.3.6 assertthat_0.2.1
## [73] emdbook_1.3.14 cachem_1.1.0
## [75] xfun_0.54 mime_0.13
## [77] S4Arrays_1.6.0 coda_0.19-4.1
## [79] survival_3.8-3 statmod_1.5.0
## [81] nlme_3.1-168 bit64_4.6.0-1
## [83] progress_1.2.3 filelock_1.0.3
## [85] bslib_0.9.0 nor1mix_1.3-3
## [87] irlba_2.3.5.1 KernSmooth_2.23-26
## [89] otel_0.2.0 splitstackshape_1.4.8
## [91] DBI_1.2.3 tidyselect_1.2.1
## [93] bit_4.6.0 compiler_4.4.3
## [95] curl_7.0.0 httr2_1.2.1
## [97] xml2_1.4.1 DelayedArray_0.32.0
## [99] caTools_1.18.3 scales_1.4.0
## [101] quadprog_1.5-8 rappdirs_0.3.3
## [103] digest_0.6.37 rmarkdown_2.30
## [105] GEOquery_2.74.0 htmltools_0.5.8.1
## [107] pkgconfig_2.0.3 sparseMatrixStats_1.18.0
## [109] dbplyr_2.5.1 fastmap_1.2.0
## [111] rlang_1.1.6 htmlwidgets_1.6.4
## [113] UCSC.utils_1.2.0 shiny_1.11.1
## [115] DelayedMatrixStats_1.28.1 farver_2.1.2
## [117] jquerylib_0.1.4 jsonlite_2.0.0
## [119] BiocParallel_1.40.2 mclust_6.1.1
## [121] RCurl_1.98-1.17 GenomeInfoDbData_1.2.13
## [123] Rhdf5lib_1.28.0 Rcpp_1.1.0
## [125] babelgene_22.9 stringi_1.8.7
## [127] zlibbioc_1.52.0 MASS_7.3-65
## [129] plyr_1.8.9 org.Hs.eg.db_3.20.0
## [131] ggstats_0.11.0 splines_4.4.3
## [133] multtest_2.62.0 hms_1.1.3
## [135] rngtools_1.5.2 reshape2_1.4.4
## [137] XML_3.99-0.19 evaluate_1.0.5
## [139] tzdb_0.4.0 httpuv_1.6.16
## [141] openssl_2.3.4 reshape_0.8.10
## [143] ashr_2.2-67 xtable_1.8-4
## [145] restfulr_0.0.16 later_1.4.4
## [147] viridisLite_0.4.2 class_7.3-23
## [149] truncnorm_1.0-9 memoise_2.0.1
## [151] GenomicAlignments_1.42.0 timechange_0.3.0